Skripri - Feed Parsing Engine

Ceritanya list sudah ada di collection MongoLab, tetapi untuk testing digunakan inisiasi link manual


In [1]:
import feedparser
import sys
import time
from pymongo import MongoClient

In [1]:
# server = 'localhost'
# port = 27017
# db_name = 'thosangs-news'
# username = 'userSkripsi'

In [3]:
import pymongo
print 'Python version', sys.version
print 'Pymongo version', pymongo.version

# connect to server
print '\nConnecting ...'
conn = MongoClient(server, port)

# Get the database
print '\nGetting database ...'
db = conn[db_name]

# Have to authenticate to get access
print '\nAuthenticating ...'
db.authenticate(username, password)


Python version 2.7.10 (default, Oct 14 2015, 16:09:02) 
[GCC 5.2.1 20151010]
Pymongo version 3.1.1

Connecting ...

Getting database ...

Authenticating ...
Out[3]:
True

for i in range(len(feeds)): print(feeds[i][1])

pos = db.newsLink pos.insert_many([{ 'name' : feeds[i][0],'link' : feeds[i][1]} for i in range(len(feeds))])


In [4]:
link = db.newsLink.find()
feeds = []
for l in link:
    dummy = {}
    dummy['name'] = l['name']
    dummy['link'] = l['link']
    feeds.append(dummy)

In [29]:
doc = dict()
for i in range(len(feeds)):
    start_time = time.time()
    doc[feeds[i]['name']] = feedparser.parse(feeds[i]['link'])['entries']
    for j in range(len(doc[feeds[i]['name']])) : doc[feeds[i]['name']][j].pop('published_parsed') 
    print '{0} {1}-News {2}-Seconds'.format(feeds[i]['name'],len(doc[feeds[i]['name']]),(time.time()-start_time))


detik 20-News 0.70645403862-Seconds
viva 200-News 0.395403146744-Seconds
merdeka 100-News 0.205430984497-Seconds
liputan6 100-News 0.323644161224-Seconds
tribun 20-News 0.140833854675-Seconds
okezone 50-News 0.112392902374-Seconds
jpnn 16-News 0.112134933472-Seconds
suara 20-News 0.110912084579-Seconds
bisniscom 20-News 0.132827043533-Seconds

In [26]:
pos = db.news
for linkBerita in doc:
    for berita in doc[linkBerita]:
        if (pos.find({'link' : berita['link']})):
            doc[linkBerita].remove(berita)
        else:
            print 'TIDAK '+berita['link']

In [31]:
dd = pos.find({'link' : doc['viva'][1]['link']})
print doc['detik'][1]['link']


http://detik.feedsportal.com/c/33613/f/656089/s/4e8048a9/sc/3/l/0Lfinance0Bdetik0N0Cread0C20A160C0A30C250C1120A170C31730A410C40Cakhir0Emaret0Einka0Eekspor0E150Ekereta0Emade0Ein0Emadiun0Eke0Ebangladesh/story01.htm

In [28]:
pos.insert_many([doc[feeds[i]['name']][j] for i in range(len(feeds)) for j in range(len(doc[feeds[i]['name']]))])


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-28-f351c3ec5120> in <module>()
----> 1 pos.insert_many([doc[feeds[i]['name']][j] for i in range(len(feeds)) for j in range(len(doc[feeds[i]['name']]))])

/home/blank/.local/lib/python2.7/site-packages/pymongo/collection.pyc in insert_many(self, documents, ordered)
    577         """
    578         if not isinstance(documents, collections.Iterable) or not documents:
--> 579             raise TypeError("documents must be a non-empty list")
    580         inserted_ids = []
    581         def gen():

TypeError: documents must be a non-empty list

In [ ]:
d = pos.find

In [11]:
#testing ekstrak deskripsi singkat detik.com
def FindShortDesc(desc,cekawal,cekakhir):
    cek = desc
    awal  = cek.find(cekawal)+len(cekawal)
    akhir = cek.find(cekakhir)
    return cek[awal:akhir]

for j in range(0,len(ds)):
    judul = ds.title[j]
    shortdesc = FindShortDesc(ds.summary[j],'width="100" />','<br c')
    img = FindShortDesc(ds.summary[j],'src="','" width')
    link = ds.id[j]
    print judul
    print shortdesc
    print img
    print link+'\n'


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-11-f528975bfb91> in <module>()
      6     return cek[awal:akhir]
      7 
----> 8 for j in range(0,len(ds)):
      9     judul = ds.title[j]
     10     shortdesc = FindShortDesc(ds.summary[j],'width="100" />','<br c')

NameError: name 'ds' is not defined

In [6]:
print(FindShortDesc(ds.summary[0],'src="','" width'))


http://images.detik.com/content/2015/11/26/4/sawah.jpeg